In [1]:
    
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
    
Test 2
Input Features: x, x^2
Output / Target: y_noisy
Objective: Show how adding relevant features improves prediction accuracy
In [2]:
    
def quad_func(x):
    """Ground-truth quadratic used to generate targets: f(x) = 5x^2 - 23x + 47."""
    squared_term = 5 * x ** 2
    linear_term = -23 * x
    return squared_term + linear_term + 47
    
In [3]:
    
# Training Set + Eval Set: 200 samples (70%, 30% split)
# Test Set: 60 samples
# Total: 260 samples
    
In [4]:
    
# Fix the legacy global NumPy seed so the synthetic dataset is reproducible.
np.random.seed(5)

samples = 260  # 200 rows for train/eval + 60 for test

# Draw x uniformly from [0, 20) and derive the squared feature.
# (RNG call order — rand first, then randn — is preserved so the
# generated values are identical.)
x_vals = pd.Series(20 * np.random.rand(samples))
x2_vals = x_vals ** 2

# Noise-free quadratic target, then a noisy version with Gaussian
# noise of standard deviation 50 added.
y_vals = x_vals.map(quad_func)
y_noisy_vals = y_vals + 50 * np.random.randn(samples)
    
In [5]:
    
# Collect the raw feature, the engineered squared feature, and both
# targets (clean and noisy) into a single frame for export/plotting.
columns = {
    'x': x_vals,
    'x2': x2_vals,
    'y': y_vals,
    'y_noisy': y_noisy_vals,
}
df = pd.DataFrame(columns)
    
In [6]:
    
# Peek at the first five rows to sanity-check the generated columns.
df.head()
    
    Out[6]:
In [7]:
    
# Pairwise correlations — shows how strongly x and x2 each relate to the targets.
df.corr()
    
    Out[7]:
In [8]:
    
# Visualize the clean quadratic (red dots) against its noisy counterpart
# (blue '+'), both as functions of x.
# Uses the explicit Axes interface instead of the pyplot state machine,
# which is the recommended matplotlib idiom; also drops a stray trailing
# comma that was left in the first scatter call.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df['x'], df['y'], color='r', label='y')
ax.scatter(df['x'], df['y_noisy'], color='b', label='y noisy', marker='+')
ax.set_xlabel('x')
ax.set_ylabel('Target Attribute')
ax.grid(True)
ax.legend()
    
    Out[8]:
    
In [9]:
    
# Build the output directory path portably. The original hard-coded
# Windows backslashes ('..\Data\...'), which only works because '\D',
# '\R' and '\q' happen not to be escape sequences (a DeprecationWarning
# in modern Python) and breaks entirely on POSIX systems.
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'quadratic')
    
In [10]:
    
# Persist the full dataset (all 260 rows, every column) for later reloading.
df.to_csv(os.path.join(data_path,'quadratic_example_all.csv'),
          index = True,
          index_label = 'Row')
    
In [11]:
    
# Training set for the underfit model: first 200 rows, x feature only
# (the quadratic term is deliberately withheld).
df[df.index < 200].to_csv(os.path.join(data_path, 'quadratic_example_train_underfit.csv'),
                          index = True,
                          index_label = 'Row', 
                          columns = ['x', 'y_noisy'])
    
In [12]:
    
# Training set for the normal model: first 200 rows with both x and x2,
# so the learner can capture the quadratic relationship.
df[df.index < 200].to_csv(os.path.join(data_path, 'quadratic_example_train_normal.csv'),
                          index = True,
                          index_label = 'Row',
                          columns= ['x', 'x2', 'y_noisy'])
    
In [13]:
    
# Prediction input for the underfit model: x only, over ALL 260 rows
# (not just the 60 held-out rows) — presumably so predictions can be
# plotted against the full dataset.
df.to_csv(os.path.join(data_path, 'quadratic_example_test_all_underfit.csv'), 
          index = True,
          index_label = 'Row', 
          columns = ['x'])
    
In [14]:
    
# Prediction input for the normal model: x and x2, over all 260 rows.
df.to_csv(os.path.join(data_path, 'quadratic_example_test_all_normal.csv'),
          index = True,
          index_label = 'Row', 
          columns = ['x', 'x2'])
    
In [15]:
    
# Pull predictions for the underfit (x-only) model.
# Reload the full dataset written earlier, then read the prediction file
# (the 'bp-*.csv.gz' name suggests an Amazon ML batch-prediction output —
# TODO confirm) and give its columns readable names.
df = pd.read_csv(os.path.join(data_path,'quadratic_example_all.csv'), 
                 index_col = 'Row')
df_predicted_underfit = pd.read_csv(os.path.join(data_path, 'output_underfit',
                                                 'bp-pNYIAR35aSV-quadratic_example_test_all_underfit.csv.gz'))
df_predicted_underfit.columns = ["Row", "y_predicted"]
    
In [16]:
    
# Compare actual noisy targets (blue '+') with the underfit model's
# predictions (green '^'); the predictions should trace a straight line
# since the model only sees x.
# Rewritten with the explicit Axes interface (recommended matplotlib
# idiom) instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df.x, df.y_noisy, color='b', label='actual', marker='+')
ax.scatter(df.x, df_predicted_underfit.y_predicted, color='g',
           label='Fit (x)', marker='^')
ax.set_title('Quadratic - underfit')
ax.set_xlabel('x')
ax.set_ylabel('Target Attribute')
ax.grid(True)
ax.legend()
    
    Out[16]:
    
Test 1: Training RMSE: 385.18, Evaluation RMSE: 257.89, Baseline RMSE: 437.31 Wojciech results: Training RMSE: 385.16, Evaluation RMSE: 257.898, Baseline RMSE: 437.311
The model's RMSE is large and close to the baseline, indicating underfitting.
In [17]:
    
# Side-by-side distributions: actual targets vs. underfit predictions.
# NOTE(review): the `labels=` kwarg was renamed `tick_labels=` in
# Matplotlib 3.9 — confirm the pinned matplotlib version before upgrading.
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy, df_predicted_underfit.y_predicted], 
            labels = ['actual','predicted-underfit'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('y')
plt.grid(True)
    
    
In [18]:
    
# Summary statistics of the actual noisy targets, for comparison below.
df.y_noisy.describe()
    
    Out[18]:
In [19]:
    
# Summary statistics of the underfit predictions — compare with the actuals above.
df_predicted_underfit.y_predicted.describe()
    
    Out[19]:
In [20]:
    
# Pull predictions for the normal model (trained on x and x2) and give
# the columns readable names.
df_predicted_normal = pd.read_csv(os.path.join(data_path,'output_normal',
                                               'bp-In6EUvWaCw2-quadratic_example_test_all_normal.csv.gz'))
df_predicted_normal.columns = ["Row", "y_predicted"]
    
In [21]:
    
# Overlay actuals, underfit predictions, and normal-model predictions so
# the improvement from adding the x^2 feature is visible.
# Fixes: labels were assigned to every scatter but plt.legend() was left
# commented out, so the legend never rendered — restored here. Also
# rewritten with the explicit Axes interface (recommended matplotlib idiom).
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df.x, df.y_noisy, color='b', label='actual', marker='+')
ax.scatter(df.x, df_predicted_underfit.y_predicted, color='g',
           label='Fit (x)', marker='^')
ax.scatter(df.x, df_predicted_normal.y_predicted, color='r',
           label='Fit (x,x^2)')
ax.set_title('Quadratic - normal fit')
ax.grid(True)
ax.set_xlabel('x')
ax.set_ylabel('Target Attribute')
ax.legend()
    
    Out[21]:
    
Test 1: Training RMSE: 385.16, Evaluation RMSE: 257.89, Baseline RMSE: 437.31
Test 2: Training RMSE: 132.20, Evaluation RMSE: 63.68, Baseline RMSE: 437.31
Test 2 RMSE is much better than the baseline. Note that the noise added to y is Gaussian with a standard deviation of 50 (so most — but not all — noise values fall within roughly ±50).
In [22]:
    
# Distributions of actuals vs. both models' predictions side by side.
# NOTE(review): `labels=` was renamed `tick_labels=` in Matplotlib 3.9 —
# confirm the pinned matplotlib version before upgrading.
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy,df_predicted_underfit.y_predicted, df_predicted_normal.y_predicted], 
            labels = ['actual','predicted-underfit','predicted-normal'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('y')
plt.grid(True)
    
    
In [23]:
    
# Spot-check the first few underfit predictions.
df_predicted_underfit.head()
    
    Out[23]:
In [24]:
    
# Spot-check the first few normal-model predictions.
df_predicted_normal.head()
    
    Out[24]: